Goals of this notebook. Take our best model file and:

  • plot weights
  • look at number of parameters
  • check saturation of norms

In [1]:
import pylearn2.utils
import pylearn2.config
import theano
import neukrill_net.dense_dataset
import neukrill_net.utils
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import holoviews as hl
%load_ext holoviews.ipython
import sklearn.metrics


Using gpu device 0: Tesla K40c
:0: FutureWarning: IPython widgets are experimental and may change in the future.
Welcome to the HoloViews IPython extension! (http://ioam.github.io/holoviews/)
Available magics: %compositor, %opts, %params, %view, %%labels, %%opts, %%view

At the time of writing, our best model is defined by the run settings file alexnet_based_40aug.json, which takes the AlexNet-based architecture, adds an extra convolutional layer and uses more augmentation. Full details are in the following YAML file:


In [2]:
cd ..


/afs/inf.ed.ac.uk/user/s08/s0805516/repos/neukrill-net-work

In [3]:
cat yaml_templates/alexnet_based_extra_convlayer.yaml


!obj:pylearn2.train.Train {
    dataset: &train !obj:neukrill_net.dense_dataset.DensePNGDataset {
        settings_path: %(settings_path)s,
        run_settings: %(run_settings_path)s,
        training_set_mode: "train"
    },
    model: !obj:pylearn2.models.mlp.MLP {
        batch_size: &batch_size 128,
        input_space: !obj:pylearn2.space.Conv2DSpace {
            shape: %(final_shape)s,
            num_channels: 1,
            axes: ['c', 0, 1, 'b'],
        },
        layers: [ !obj:pylearn2.models.mlp.ConvRectifiedLinear {
                     layer_name: h1,
                     output_channels: 48,
                     irange: .025,
                     init_bias: 0,
                     kernel_shape: [8, 8],
                     pool_shape: [2, 2],
                     pool_stride: [2, 2],
                     max_kernel_norm: 1.9365
                 },!obj:pylearn2.models.mlp.ConvRectifiedLinear {
                     layer_name: h2,
                     output_channels: 96,
                     irange: .025,
                     init_bias: 1,
                     kernel_shape: [5, 5],
                     pool_shape: [2, 2],
                     pool_stride: [2, 2],
                     max_kernel_norm: 1.9365
                 }, !obj:pylearn2.models.mlp.ConvRectifiedLinear {
                     layer_name: h3,
                     output_channels: 128,
                     irange: .025,
                     init_bias: 0,
                     kernel_shape: [3, 3],
                     border_mode: full,
                     pool_shape: [1, 1],
                     pool_stride: [1, 1],
                     max_kernel_norm: 1.9365
                  }, !obj:pylearn2.models.mlp.ConvRectifiedLinear {
                     layer_name: h4,
                     output_channels: 192,
                     irange: .025,
                     init_bias: 0,
                     kernel_shape: [3, 3],
                     border_mode: full,
                     pool_shape: [1, 1],
                     pool_stride: [1, 1],
                     max_kernel_norm: 1.9365
                 }, !obj:pylearn2.models.mlp.ConvRectifiedLinear {
                     layer_name: 'h5',
                     output_channels: 128,
                     irange: .025,
                     init_bias: 1,
                     kernel_shape: [3, 3],
                     border_mode: full,
                     pool_shape: [2, 2],
                     pool_stride: [2, 2],
                     max_kernel_norm: 1.9365
                 }, !obj:pylearn2.models.mlp.RectifiedLinear {
                     dim: 1024,
                     max_col_norm: 1.9,
                     layer_name: h6,
                     istdev: .05,
                     W_lr_scale: .25,
                     b_lr_scale: .25
                 }, !obj:pylearn2.models.mlp.Softmax {
                     n_classes: %(n_classes)i,
                     max_col_norm: 1.9365,
                     layer_name: y,
                     istdev: .05,
                     W_lr_scale: .25,
                     b_lr_scale: .25
                 }
                ],
    },
    algorithm: !obj:pylearn2.training_algorithms.sgd.SGD {
        train_iteration_mode: even_shuffled_sequential,
        monitor_iteration_mode: even_sequential,
        batch_size: *batch_size,
        learning_rate: .1,
        learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum {
            init_momentum: 0.5
        },
        monitoring_dataset: {
                'train': *train,
                'valid' : !obj:neukrill_net.dense_dataset.DensePNGDataset  {
                                settings_path: %(settings_path)s,
                                run_settings: %(run_settings_path)s,
                                training_set_mode: "validation"
            },
        },
        cost: !obj:pylearn2.costs.cost.SumOfCosts { costs: [ 
            !obj:pylearn2.costs.mlp.dropout.Dropout {
                input_include_probs: {
                    h1 : 1.,
                    h2 : 1.,
                    h3 : 1.,
                    h4 : 1.,
                    h5 : 1.,
                    h6 : 0.5
                },
                input_scales: {
                    h1 : 1.,
                    h2 : 1.,
                    h3 : 1.,
                    h4 : 1.,
                    h5 : 1.,
                    h6 : 2.
                }
             },
             !obj:pylearn2.costs.mlp.WeightDecay {
                 coeffs : {
                     h1 : .00005,
                     h2 : .00005,
                     h3 : .00005,
                     h4 : .00005,
                     h5 : .00005,
                     h6 : .00005
                 }
             }
             ]
        },
        termination_criterion: !obj:pylearn2.termination_criteria.And {
            criteria: [
                !obj:pylearn2.termination_criteria.EpochCounter {
                    max_epochs: 500
                },
            ]
        }
    },
    extensions: [
        !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor {
            start: 1,
            saturate: 25,
            final_momentum: 0.95
        },
        !obj:pylearn2.training_algorithms.sgd.LinearDecayOverEpoch {
            start: 1,
            saturate: 25,
            decay_factor: 0.025
        },
        !obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest {
             channel_name: valid_objective,
             save_path: '%(save_path)s'
        },
        !obj:pylearn2.training_algorithms.sgd.MonitorBasedLRAdjuster {
            high_trigger: 1.,
            low_trigger: 0.999,
            grow_amt: 1.1,
            shrink_amt: 0.9,
            max_lr: 0.2,
            min_lr: 1e-5,
            channel_name: valid_y_misclass
        }
    ],
    save_path: '%(alt_picklepath)s',
    save_freq: 1
}
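
The %(...)s placeholders in this template are presumably filled in from the run settings before the YAML is parsed. The following is a minimal sketch of how that substitution could look using Python %-formatting; the keys match the placeholders above, but the values here are hypothetical (the real ones come from the run settings JSON):

# Sketch only: values are illustrative, not the actual run settings.
with open("yaml_templates/alexnet_based_extra_convlayer.yaml") as f:
    template = f.read()

yaml_string = template % {
    "settings_path": "settings.json",
    "run_settings_path": "run_settings/alexnet_based_40aug.json",
    "final_shape": [48, 48],            # hypothetical resized image shape
    "n_classes": 121,
    "save_path": "/tmp/best_model.pkl",
    "alt_picklepath": "/tmp/recent_model.pkl",
}
# train = pylearn2.config.yaml_parse.load(yaml_string)
# train.main_loop()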

It has relatively few MLP layers, so it is worth looking at how the parameters in our model are distributed between the MLP layers and the convolutional ones.


In [4]:
settings = neukrill_net.utils.Settings("settings.json")
run_settings = neukrill_net.utils.load_run_settings(
    "run_settings/quicker_learning_1_fc_layer_experiment_no_norms_repeat.json", settings, force=True)

In [5]:
model = pylearn2.utils.serial.load(run_settings["pickle abspath"])

In [6]:
params = model.get_params()

In [7]:
params[0].name


Out[7]:
'h1_W'

In [8]:
total_params = sum(map(lambda x: x.get_value().size,params))
print("Total parameters: {0}".format(total_params))


Total parameters: 48287340

In [9]:
for l in params:
    print("Layer {0}: {1} parameters".format(l.name,l.get_value().size))


Layer h1_W: 1200 parameters
Layer h1_b: 221952 parameters
Layer h2_W: 55296 parameters
Layer h2_b: 165888 parameters
Layer h3_W: 147456 parameters
Layer h3_b: 184832 parameters
Layer h4_W: 47316992 parameters
Layer h4_b: 1024 parameters
Layer softmax_b: 2 parameters
Layer softmax_W: 2048 parameters
Layer softmax_b: 4 parameters
Layer softmax_W: 4096 parameters
Layer softmax_b: 7 parameters
Layer softmax_W: 7168 parameters
Layer softmax_b: 16 parameters
Layer softmax_W: 16384 parameters
Layer softmax_b: 38 parameters
Layer softmax_W: 38912 parameters
Layer softmax_b: 121 parameters
Layer softmax_W: 123904 parameters

In [10]:
for l in params:
    print("Layer {0}: {1}% of the parameters.".format(l.name,
                        100*(float(l.get_value().size)/total_params)))


Layer h1_W: 0.00248512342987% of the parameters.
Layer h1_b: 0.459648429588% of the parameters.
Layer h2_W: 0.114514487648% of the parameters.
Layer h2_b: 0.343543462945% of the parameters.
Layer h3_W: 0.305371967062% of the parameters.
Layer h3_b: 0.382775278158% of the parameters.
Layer h4_W: 97.9904712084% of the parameters.
Layer h4_b: 0.00212063866015% of the parameters.
Layer softmax_b: 4.14187238311e-06% of the parameters.
Layer softmax_W: 0.00424127732031% of the parameters.
Layer softmax_b: 8.28374476623e-06% of the parameters.
Layer softmax_W: 0.00848255464062% of the parameters.
Layer softmax_b: 1.44965533409e-05% of the parameters.
Layer softmax_W: 0.0148444706211% of the parameters.
Layer softmax_b: 3.31349790649e-05% of the parameters.
Layer softmax_W: 0.0339302185625% of the parameters.
Layer softmax_b: 7.86955752792e-05% of the parameters.
Layer softmax_W: 0.0805842690859% of the parameters.
Layer softmax_b: 0.000250583279178% of the parameters.
Layer softmax_W: 0.256597277879% of the parameters.

The reason we probably see little difference when adding extra MLP layers is that the weight matrix leading into the first MLP layer is so much larger than the matrices between the MLP layers themselves; adding more MLP layers barely increases the total number of parameters.
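
To make that concrete, here is a quick back-of-the-envelope check, assuming the flattened convolutional output feeding the dense layer has 46208 dimensions (which matches the 47,316,992 parameters reported for h4_W above):

# weight matrix from the flattened conv output into a 1024-unit dense layer
print(46208 * 1024)   # 47316992, matching h4_W above
# weight matrix a further 1024-unit MLP layer would add
print(1024 * 1024)    # 1048576, only about 2% of the ~48M total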

Receptive Fields/Kernels

Looking at the kernels we've learnt, using the show_weights.py script provided by Pylearn2.


In [11]:
%env PYLEARN2_VIEWER_COMMAND=/afs/inf.ed.ac.uk/user/s08/s0805516/repos/neukrill-net-work/image_hack.sh


env: PYLEARN2_VIEWER_COMMAND=/afs/inf.ed.ac.uk/user/s08/s0805516/repos/neukrill-net-work/image_hack.sh

In [12]:
%run ~/repos/pylearn2/pylearn2/scripts/show_weights.py /disk/scratch/neuroglycerin/models/quicker_learning_1_fc_layer_experiment_no_norms_repeat_recent.pkl


making weights report
loading model
loading done
smallest enc weight magnitude: 3.95500101149e-05
mean enc weight magnitude: 0.101006127894
max enc weight magnitude: 1.06396627426

In [13]:
from IPython.display import Image

In [14]:
def plot_recent_pylearn2():
    pl2plt = Image(filename="/afs/inf.ed.ac.uk/user/s08/s0805516/tmp/pylearnplot.png", width=500)
    return pl2plt
plot_recent_pylearn2()


Out[14]:
[image output: first-layer kernel weights rendered by Pylearn2's show_weights.py]

Trying to reproduce this plot, but more flexibly, using HoloViews.


In [16]:
weights = model.get_weights_topo()

In [31]:
#%%opts HeatMap style(cmap='Greys')
# build a HoloViews Layout with one HeatMap per first-layer kernel
heatmaps = None
for w in weights:
    # index each kernel weight by (row, column), taking the first channel
    w = {(i,j): w[i,j][0] for i in range(w.shape[0]) for j in range(w.shape[1])}
    if heatmaps is None:
        heatmaps = hl.HeatMap(w)
    else:
        heatmaps += hl.HeatMap(w)
heatmaps


Out[31]:
[image output: HoloViews Layout of per-kernel HeatMaps]
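
The next few cells appear to estimate, by simulation, the typical L2 norm of a kernel immediately after initialisation for a given irange, to see how far below the max_kernel_norm constraint (1.9365) the kernels start out. There is also a simple closed form for this; a sketch, assuming each of the n weights is drawn uniformly from [-irange, irange] (so each has variance irange**2/3):

import numpy as np

def expected_kernel_norm(irange, n_weights):
    # E[||W||^2] = n_weights * irange**2 / 3, so E[||W||] is roughly:
    return irange * np.sqrt(n_weights / 3.0)

# with the irange of .025 used in the YAML above (kernel shapes are illustrative)
print(expected_kernel_norm(0.025, 5 * 5 * 1))    # ~0.07, far below 1.9365
print(expected_kernel_norm(0.025, 3 * 3 * 128))  # ~0.49, still well below 1.9365

So at initialisation none of the convolutional kernel norms should be anywhere near the constraint.
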
In [121]:
maxes = []
s = np.logspace(-5,-0.5,50)
for irange in s:
    # simulate 1000 kernels of shape 5x5x1, each weight uniform in [-irange, irange]
    W = ((np.random.rand(1000,5,5,1)*2)-1)*irange
    maxes.append(np.mean(np.sqrt(np.sum(W**2,axis=(1,2,3)))))

In [124]:
plt.xlabel("irange")
plt.ylabel("max_kernel_norm")
plt.plot(s,maxes)
plt.grid()



In [128]:
maxes = []
s = np.logspace(-4,-1,50)
for irange in s:
    W = ((np.random.rand(10,3,3,48)*2)-1)*irange
    maxes.append(np.mean(np.sqrt(np.sum(W**2,axis=(1,2,3)))))

In [129]:
plt.xlabel("irange")
plt.ylabel("max_kernel_norm")
plt.plot(s,maxes)
plt.grid()



In [134]:
maxes = []
s = np.logspace(-4,-1.2,50)
for irange in s:
    W = ((np.random.rand(10,3,3,128)*2)-1)*irange
    maxes.append(np.mean(np.sqrt(np.sum(W**2,axis=(1,2,3)))))

In [135]:
plt.xlabel("irange")
plt.ylabel("max_kernel_norm")
plt.plot(s,maxes)
plt.grid()



In [3]:
10*46208*1024


Out[3]:
473169920

In [15]:
maxes = []
s = np.logspace(-5,-2,50)
for istdev in s:
    # Gaussian init for the dense layer: 10 sample weight vectors over the 46208 flattened inputs
    W = np.random.randn(10,46208)*istdev
    maxes.append(np.mean(np.sqrt(np.sum(W**2,axis=1))))

In [16]:
plt.xlabel("istdev")
plt.ylabel("max_col_norm")
plt.plot(s,maxes)
plt.grid()



In [19]:
# MLP layer: search for the istdev that gives a mean column norm of about 0.3
# (note the loop scales istdev once more after the target is reached, so the
#  printed value is one 1.1x step above the threshold)
maxnorm = 0
istdev = 0.0005
while maxnorm < 0.3:
    W = np.random.randn(1024,46208)*istdev
    maxnorm = np.mean(np.sqrt(np.sum(W**2,axis=1)))
    istdev = istdev*1.1
print("istdev should be approximately {0}".format(istdev))


istdev should be approximately 0.00156921418836
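
A quicker analytic check (my own sketch): for a column of n i.i.d. N(0, istdev**2) weights the expected norm is roughly istdev * sqrt(n), so targeting a norm of 0.3 over the 46208 inputs gives:

import numpy as np
print(0.3 / np.sqrt(46208))   # ~0.0014

which is consistent with the value found by the search above (the search prints one 1.1x step past the threshold).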